Exploratory Data Analysis (EDA)¶
Load Dataset¶
In [1]:
import os
import sys

import pandas as pd

# Make the project helper modules in ../scripts importable before importing them.
sys.path.append('../scripts')
from data_scraper import *
from analysis import *
from visualization import *
from data_processing import *
from sentiment_analysis import *
In [2]:
# Path to the cleaned-reviews CSV. Every directory is its own os.path.join
# component so the resulting path uses a single separator style on any OS.
filename = 'cleaned_reviews_df.csv'
path = os.path.join('..', 'data', '02_intermediate', filename)

# Load the dataset, then parse review_date into datetime64 so later cells can
# work with it as a timestamp. Uses pandas as `pd`.
user_review_df = load_dataset(path)
user_review_df['review_date'] = pd.to_datetime(user_review_df['review_date'])
In [3]:
# Preview the loaded reviews; the rendering is truncated to the first and last rows.
# NOTE(review): the saved output exposes reviewer names (userName) — consider
# redacting or clearing it before sharing the notebook.
display(user_review_df)
| reviewId | userName | content | score | thumbsUpCount | reviewCreatedVersion | review_date | appVersion | |
|---|---|---|---|---|---|---|---|---|
| 0 | e20abe49-8fe7-42fe-af3a-91399875b21a | Abbatu Ermias | Awe ሃረፍ nawe | 5 | 0 | 24.04.23 | 2024-05-21 05:06:05 | 24.04.23 |
| 1 | 20c4de51-f5cd-41f3-b21a-efdf8cb4463e | Eskender Million | It has a very complex, inconvenient to use, an... | 1 | 1 | 24.04.23 | 2024-05-17 20:37:40 | 24.04.23 |
| 2 | f7b29ba8-e90e-4dd6-bf6e-f1480b119456 | Leta Teshome | Never try Apollo. To be honest, this app will ... | 1 | 2 | 24.04.23 | 2024-05-17 00:40:28 | 24.04.23 |
| 3 | 250c64b7-029e-44d7-a9b2-7ce655198af7 | adonias addis | I like | 5 | 0 | 24.04.23 | 2024-05-16 20:36:19 | 24.04.23 |
| 4 | fc32f69e-fc81-4697-a29e-5d45c57fe945 | Lama Yig | This not more intractive and fast | 1 | 0 | 23.08.03 | 2024-05-16 07:32:51 | 23.08.03 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 644 | 65b305bf-ce99-4b29-908d-9e6dd7cdb2b8 | Lemi Tadesse | This app makes transactions easiest! I want to... | 5 | 181 | 23.01.16 | 2023-02-08 15:18:43 | 23.01.16 |
| 645 | 565193a5-77c9-46bd-9bc1-81e91eabbf76 | Ermiyas Abas Abdulahi | Excellent | 4 | 0 | 23.01.16 | 2023-02-05 20:14:31 | 23.01.16 |
| 646 | 3ffa88e8-7593-46e3-a804-ffb9436d3aaa | semale sefa | Dese aylem | 1 | 5 | 23.01.16 | 2023-02-04 16:27:05 | 23.01.16 |
| 647 | 5e598bd1-7a06-4f2c-af39-7022d8f630db | Abiy Addis | Honestly better than any other option. A bit s... | 4 | 4 | 23.01.16 | 2023-02-03 23:48:06 | 23.01.16 |
| 648 | 2bfae76a-ad5b-439d-b23e-0efdea2e5018 | Bonsa werkina | It seems best digital feature | 5 | 1 | 23.01.16 | 2023-02-03 13:52:37 | 23.01.16 |
649 rows × 8 columns
In [4]:
# Inspect dtypes and non-null counts; review_date should now be datetime64.
user_review_df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 649 entries, 0 to 648 Data columns (total 8 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 reviewId 649 non-null object 1 userName 649 non-null object 2 content 649 non-null object 3 score 649 non-null int64 4 thumbsUpCount 649 non-null int64 5 reviewCreatedVersion 649 non-null object 6 review_date 649 non-null datetime64[ns] 7 appVersion 649 non-null object dtypes: datetime64[ns](1), int64(2), object(5) memory usage: 40.7+ KB
Univariate Analysis¶
Numerical Features¶
In [5]:
# Per-column summary statistics. The recorded output covers score and
# thumbsUpCount with mean, median, mode, standard deviation, minimum and maximum.
uni_analysis = univariate_analysis(user_review_df)
display(uni_analysis)
| Mean | Median | Mode | Standard Deviation | Minimum | Maximum | |
|---|---|---|---|---|---|---|
| score | 3.394453 | 4.0 | 5 | 1.785150 | 1 | 5 |
| thumbsUpCount | 10.759630 | 1.0 | 0 | 61.488111 | 0 | 686 |
In [6]:
# NOTE(review): no output is captured for this call in this export — presumably
# it renders plots; confirm against the helper's definition.
descriptive_stats(user_review_df)
Categorical Features¶
In [7]:
# Version columns to tabulate.
columns = [
    "reviewCreatedVersion",
    "appVersion",
]

# Frequency table: one row per version value, one count column per entry above.
# NOTE(review): in the recorded output both columns carry identical counts for
# every version, so the two columns may be redundant — confirm.
freq_counts = categorical_univariate_analysis(user_review_df, columns)
display(freq_counts)
| reviewCreatedVersion | appVersion | |
|---|---|---|
| reviewCreatedVersion | ||
| 24.03.04 | 132 | 132 |
| 23.10.24 | 81 | 81 |
| 23.03.15 | 58 | 58 |
| 24.01.17 | 50 | 50 |
| 23.12.29 | 41 | 41 |
| 23.09.23 | 41 | 41 |
| 23.12.18 | 36 | 36 |
| 24.04.23 | 34 | 34 |
| 23.01.16 | 32 | 32 |
| 24.02.14 | 30 | 30 |
| 23.07.08 | 25 | 25 |
| 23.08.03 | 20 | 20 |
| 23.12.09 | 17 | 17 |
| 24.01.30 | 14 | 14 |
| 23.06.17 | 11 | 11 |
| 23.07.24 | 8 | 8 |
| 24.02.09 | 6 | 6 |
| 23.03.03 | 6 | 6 |
| 23.02.28 | 3 | 3 |
| 22.10.27 | 2 | 2 |
| 23.07.22 | 1 | 1 |
| 23.10.18 | 1 | 1 |
In [8]:
# Presumably plots the frequency table built in the previous cell; the figure is
# not captured in this export.
visualize_categorical_univariate_analysis(freq_counts)
Bivariate / Multivariate Analysis¶
In [9]:
# Compare score against thumbsUpCount; presumably drawn as a scatter plot
# (figure not captured in this export).
scatter_plot(user_review_df, "score", "thumbsUpCount")
Trends Over Time¶
In [10]:
# Presumably plots how reviews evolve over time — confirm against the helper;
# no output is captured in this export.
analyze_review_trends(user_review_df)
Data Enrichment¶
In [11]:
# NOTE(review): same call as the earlier info() cell and the recorded output is
# identical; no enrichment step runs between the two — consider removing this
# cell or replacing it with the intended enrichment.
user_review_df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 649 entries, 0 to 648 Data columns (total 8 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 reviewId 649 non-null object 1 userName 649 non-null object 2 content 649 non-null object 3 score 649 non-null int64 4 thumbsUpCount 649 non-null int64 5 reviewCreatedVersion 649 non-null object 6 review_date 649 non-null datetime64[ns] 7 appVersion 649 non-null object dtypes: datetime64[ns](1), int64(2), object(5) memory usage: 40.7+ KB
In [ ]:
Perform Sentiment analysis¶
In [12]:
# Run sentiment analysis over the reviews. In the recorded output the result has
# three extra columns: sentiment_score, sentiment and keywords.
# NOTE(review): `content` appears lower-cased in the result; whether
# user_review_df itself is modified cannot be told from here — verify.
sentiment_df = sentiment_analysis(user_review_df)
In [13]:
# Preview the sentiment-enriched reviews (truncated to the first and last rows).
# NOTE(review): several `keywords` values in the recorded output are function
# words (e.g. "it, to, i, a, for") — stop-word filtering looks missing; confirm
# in the sentiment helper.
display(sentiment_df)
| reviewId | userName | content | score | thumbsUpCount | reviewCreatedVersion | review_date | appVersion | sentiment_score | sentiment | keywords | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | e20abe49-8fe7-42fe-af3a-91399875b21a | Abbatu Ermias | awe ሃረፍ nawe | 5 | 0 | 24.04.23 | 2024-05-21 05:06:05 | 24.04.23 | 0.000000 | Neutral | awe, ሃረፍ, nawe |
| 1 | 20c4de51-f5cd-41f3-b21a-efdf8cb4463e | Eskender Million | it has a very complex, inconvenient to use, an... | 1 | 1 | 24.04.23 | 2024-05-17 20:37:40 | 24.04.23 | -0.228958 | Negative | it, to, i, a, for |
| 2 | f7b29ba8-e90e-4dd6-bf6e-f1480b119456 | Leta Teshome | never try apollo. to be honest, this app will ... | 1 | 2 | 24.04.23 | 2024-05-17 00:40:28 | 24.04.23 | 0.036111 | Positive | to, app, this, you, and |
| 3 | 250c64b7-029e-44d7-a9b2-7ce655198af7 | adonias addis | i like | 5 | 0 | 24.04.23 | 2024-05-16 20:36:19 | 24.04.23 | 0.000000 | Neutral | i, like |
| 4 | fc32f69e-fc81-4697-a29e-5d45c57fe945 | Lama Yig | this not more intractive and fast | 1 | 0 | 23.08.03 | 2024-05-16 07:32:51 | 23.08.03 | -0.025000 | Negative | this, not, more, intractive, and |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 644 | 65b305bf-ce99-4b29-908d-9e6dd7cdb2b8 | Lemi Tadesse | this app makes transactions easiest! i want to... | 5 | 181 | 23.01.16 | 2023-02-08 15:18:43 | 23.01.16 | 0.000595 | Positive | for, to, two, want, & |
| 645 | 565193a5-77c9-46bd-9bc1-81e91eabbf76 | Ermiyas Abas Abdulahi | excellent | 4 | 0 | 23.01.16 | 2023-02-05 20:14:31 | 23.01.16 | 1.000000 | Positive | excellent |
| 646 | 3ffa88e8-7593-46e3-a804-ffb9436d3aaa | semale sefa | dese aylem | 1 | 5 | 23.01.16 | 2023-02-04 16:27:05 | 23.01.16 | 0.000000 | Neutral | dese, aylem |
| 647 | 5e598bd1-7a06-4f2c-af39-7022d8f630db | Abiy Addis | honestly better than any other option. a bit s... | 4 | 4 | 23.01.16 | 2023-02-03 23:48:06 | 23.01.16 | 0.025000 | Positive | a, honestly, better, than, any |
| 648 | 2bfae76a-ad5b-439d-b23e-0efdea2e5018 | Bonsa werkina | it seems best digital feature | 5 | 1 | 23.01.16 | 2023-02-03 13:52:37 | 23.01.16 | 0.500000 | Positive | it, seems, best, digital, feature |
649 rows × 11 columns
Save dataset¶
In [14]:
# NOTE(review): Series.squeeze() only collapses a one-element Series to a scalar;
# with the 649 rows shown above this looks like a no-op — confirm whether
# `sentiment` can ever be something other than a plain Series here.
# NOTE(review): this cell sits under the "Save dataset" heading but writes
# nothing; the save happens in a later cell.
sentiment_df['sentiment'] = sentiment_df['sentiment'].squeeze()
Visualize Sentiment Analysis¶
In [15]:
# Presumably plots the sentiment results; the figure is not captured in this export.
visualize_sentiment_analysis(sentiment_df)
In [16]:
# Destination folder for the sentiment-enriched reviews. Every directory is its
# own os.path.join component so the path uses a single separator style on any OS
# (the previously recorded output shows mixed "\" and "/" separators).
output_folder = os.path.join('..', 'data', '02_intermediate')
filename = 'sentiment_reviews_df.csv'

# save_dataset prints the destination ("Dataset saved to ..."); its return value
# is kept in output_path — presumably the written file path, confirm.
output_path = save_dataset(sentiment_df, output_folder, filename)
Dataset saved to ..\data/02_intermediate\sentiment_reviews_df.csv
Extract Bank Telegram ads data¶
Load All Banks Ads Data¶
In [17]:
# Telegram post data for all banks. Every directory is its own os.path.join
# component so the path uses a single separator style on any OS.
filename = 'banks_telegram_posts_data.csv'
path = os.path.join('..', 'data', '01_raw', filename)
banks_ads_data = load_dataset(path)
Get Bank data¶
In [18]:
# Bank hashtag to select.
bank_name = "#CBE"

# Rows of the all-banks table for that bank. The recorded output keeps the
# original row labels (85 rows, index running from 0 to 132).
bank_ads_data = extract_bank_ads__data(banks_ads_data, bank_name)
In [19]:
# Preview the selected bank's posts (truncated to the first and last rows).
# NOTE(review): consecutive rows in the recorded output share the same date and
# views and differ only in post_link — possibly one post counted more than once;
# confirm before aggregating.
display(bank_ads_data)
| post_link | date | views | post_time | bank | time_of_day | |
|---|---|---|---|---|---|---|
| 0 | https://t.me/tikvahethiopia/70382 | 2022-05-19T13:47:14+00:00 | 310300 | 13:47:14 | #CBE | afternoon |
| 1 | https://t.me/tikvahethiopia/70383 | 2022-05-19T13:47:14+00:00 | 310300 | 13:47:14 | #CBE | afternoon |
| 2 | https://t.me/tikvahethiopia/73378 | 2022-08-30T11:13:05+00:00 | 267400 | 11:13:05 | #CBE | morning |
| 3 | https://t.me/tikvahethiopia/73379 | 2022-08-30T11:13:05+00:00 | 267400 | 11:13:05 | #CBE | morning |
| 4 | https://t.me/tikvahethiopia/74556 | 2022-11-03T16:07:35+00:00 | 347100 | 16:07:35 | #CBE | afternoon |
| ... | ... | ... | ... | ... | ... | ... |
| 128 | https://t.me/tikvahethiopia/86532 | 2024-03-31T20:03:32+00:00 | 306800 | 20:03:32 | #CBE | evening |
| 129 | https://t.me/tikvahethiopia/87070 | 2024-04-21T11:52:03+00:00 | 311800 | 11:52:03 | #CBE | morning |
| 130 | https://t.me/tikvahethiopia/87071 | 2024-04-21T11:52:03+00:00 | 311800 | 11:52:03 | #CBE | morning |
| 131 | https://t.me/tikvahethiopia/87116 | 2024-04-23T06:51:39+00:00 | 245400 | 06:51:39 | #CBE | morning |
| 132 | https://t.me/tikvahethiopia/87117 | 2024-04-23T06:51:39+00:00 | 245400 | 06:51:39 | #CBE | morning |
85 rows × 6 columns
In [20]:
# Inspect dtypes and non-null counts for the selected bank's posts.
# NOTE(review): `date` is reported as object, not datetime64 — parse it before
# any time-based computation.
bank_ads_data.info()
<class 'pandas.core.frame.DataFrame'> Index: 85 entries, 0 to 132 Data columns (total 6 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 post_link 85 non-null object 1 date 85 non-null object 2 views 85 non-null int64 3 post_time 85 non-null object 4 bank 85 non-null object 5 time_of_day 85 non-null object dtypes: int64(1), object(5) memory usage: 4.6+ KB
Ad performance comparisons of different banks on Tikvah¶
In [21]:
# Derive per-ad metrics for every bank; the recorded output gains the columns
# ad_id, impressions and engagement_rate.
# NOTE(review): in every displayed row impressions equals views and
# engagement_rate is 1.0, so the metric looks degenerate — check how
# calculate_ad_performance derives it.
ads_performance_df = calculate_ad_performance(banks_ads_data)
In [22]:
# Preview the ad-performance table for all banks (truncated to the first and last rows).
display(ads_performance_df)
| post_link | date | views | post_time | bank | time_of_day | ad_id | impressions | engagement_rate | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | https://t.me/tikvahethiopia/70382 | 2022-05-19T13:47:14+00:00 | 310300 | 13:47:14 | #CBE | afternoon | #CBE_https://t.me/tikvahethiopia/70382 | 310300 | 1.0 |
| 1 | https://t.me/tikvahethiopia/70383 | 2022-05-19T13:47:14+00:00 | 310300 | 13:47:14 | #CBE | afternoon | #CBE_https://t.me/tikvahethiopia/70383 | 310300 | 1.0 |
| 2 | https://t.me/tikvahethiopia/73378 | 2022-08-30T11:13:05+00:00 | 267400 | 11:13:05 | #CBE | morning | #CBE_https://t.me/tikvahethiopia/73378 | 267400 | 1.0 |
| 3 | https://t.me/tikvahethiopia/73379 | 2022-08-30T11:13:05+00:00 | 267400 | 11:13:05 | #CBE | morning | #CBE_https://t.me/tikvahethiopia/73379 | 267400 | 1.0 |
| 4 | https://t.me/tikvahethiopia/74556 | 2022-11-03T16:07:35+00:00 | 347100 | 16:07:35 | #CBE | afternoon | #CBE_https://t.me/tikvahethiopia/74556 | 347100 | 1.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 130 | https://t.me/tikvahethiopia/87071 | 2024-04-21T11:52:03+00:00 | 311800 | 11:52:03 | #CBE | morning | #CBE_https://t.me/tikvahethiopia/87071 | 311800 | 1.0 |
| 131 | https://t.me/tikvahethiopia/87116 | 2024-04-23T06:51:39+00:00 | 245400 | 06:51:39 | #CBE | morning | #CBE_https://t.me/tikvahethiopia/87116 | 245400 | 1.0 |
| 132 | https://t.me/tikvahethiopia/87117 | 2024-04-23T06:51:39+00:00 | 245400 | 06:51:39 | #CBE | morning | #CBE_https://t.me/tikvahethiopia/87117 | 245400 | 1.0 |
| 133 | https://t.me/tikvahethiopia/87629 | 2024-05-13T17:11:15+00:00 | 259200 | 17:11:15 | #Hibretbank | afternoon | #Hibretbank_https://t.me/tikvahethiopia/87629 | 259200 | 1.0 |
| 134 | https://t.me/tikvahethiopia/87769 | 2024-05-20T06:25:35+00:00 | 261700 | 06:25:35 | #Hibretbank | morning | #Hibretbank_https://t.me/tikvahethiopia/87769 | 261700 | 1.0 |
135 rows × 9 columns
Visualize Ad Performance¶
In [23]:
# Presumably plots the per-ad metrics computed above; the figure is not captured
# in this export.
plot_ad_performance(ads_performance_df)
Analyzing Optimal Ad Placement¶
In [24]:
# Runs on the all-banks table, not the single-bank subset. No output is captured
# in this export — presumably a plot or summary; confirm against the helper.
analyze_optimal_ad_placement(banks_ads_data)
Analyze Number of Posts per Time of Day¶
In [25]:
# Runs on the all-banks table; presumably counts posts per time_of_day value
# (morning / afternoon / evening in the data shown above) — confirm. No output
# is captured in this export.
ad_per_time_of_day(banks_ads_data)
Play Store review sentiment¶
Key Strengths and Weaknesses¶
In [26]:
# Works from the sentiment-enriched reviews; no output is captured in this
# export, so what it reports should be confirmed against the helper.
get_key_strengths_and_weaknesses(sentiment_df)
In [ ]:
In [27]:
# Presumably shows how sentiment changes over review_date — confirm; no output
# is captured in this export.
get_sentiment_across_time(sentiment_df)
In [ ]:
In [28]:
# Presumably relates sentiment to app version — confirm; no output is captured
# in this export.
get_sentiment_and_version_updates(sentiment_df)
In [ ]: